# --- Fish events ------------------------------------------------------------
# Read the fish team's station/event sheet.
fish_events <- read_excel(
  path = "/Users/eric/google_drive/MarineGEO/Fish/FISH_DATA_21Jul2017.xlsx",
  sheet = "Events"
)

### Format events to schema
# Empty columns are read in as logical (all NA); recast them to character so
# they merge cleanly with the other teams' tables.
fish_events[vapply(fish_events, is.logical, logical(1))] <-
  lapply(fish_events[vapply(fish_events, is.logical, logical(1))], as.character)

# Keep only the "KANF###" station-code portion of each eventID.
fish_events[["eventID"]] <- str_extract(fish_events[["eventID"]], "KANF\\d\\d\\d")

# Round coordinates to four decimal places (~10 m positional uncertainty).
fish_events[["decimalLatitude"]] <- round(as.numeric(fish_events[["decimalLatitude"]]), digits = 4)
fish_events[["decimalLongitude"]] <- round(as.numeric(fish_events[["decimalLongitude"]]), digits = 4)
fish_events[["coordinateUncertaintyInMeters"]] <-
  as.numeric(fish_events[["coordinateUncertaintyInMeters"]])

# Spot checks (left from interactive review):
# class(fish_events$geoReferenceProtocol)
# class(fish_events$maximumDepthInMeters)
# class(fish_events$minimumDepthInMeters)
# class(fish_events$recordedBy)
# class(fish_events$samplingProtocol) #some missing. ask Diane to fill this in
# class(fish_events$habitatBiotic)

# Strip embedded carriage returns from the habitat text fields.
fish_events[["habitatGeomorphologicalZone"]] <-
  str_replace_all(fish_events[["habitatGeomorphologicalZone"]], pattern = "\r\n", replacement = "")
fish_events[["habitatSubstrate"]] <-
  str_replace_all(fish_events[["habitatSubstrate"]], pattern = "\r\n", replacement = "")

# Date parts: year/day to numeric, month abbreviation ("Jan", ...) to 1-12.
fish_events[["year"]] <- as.numeric(fish_events[["year"]])
fish_events[["month"]] <- match(fish_events[["month"]], month.abb)
fish_events[["day"]] <- as.numeric(fish_events[["day"]])

# Recommended fields (spot checks):
# class(fish_events$eventRemarks)
# class(fish_events$locality)
# class(fish_events$verbatimCoordinates)
# class(fish_events$eventMedia)

# Keep only the HH:MM start time; drop any trailing end-time text.
fish_events[["eventTime"]] <- str_extract(string = fish_events[["eventTime"]], pattern = "^\\d\\d:\\d\\d")
# --- Algae events -----------------------------------------------------------
algae_events <- read_excel(
  "/Users/eric/google_drive/MarineGEO/algae/MarineGEOHI_bioassessment_master_KANA.xlsx",
  sheet = "Station"
)

### Format events to schema
# Empty (all-NA) columns come in as logical; recast to character.
algae_events[vapply(algae_events, is.logical, logical(1))] <-
  lapply(algae_events[vapply(algae_events, is.logical, logical(1))], as.character)

# Round to 4 decimal digits (~10 m uncertainty) and force longitude negative
# (western hemisphere).
algae_events[["decimalLatitude"]] <- round(as.numeric(algae_events[["decimalLatitude"]]), digits = 4)
algae_events[["decimalLongitude"]] <- -abs(round(as.numeric(algae_events[["decimalLongitude"]]), digits = 4))

# Spot checks (left from interactive review):
# class(algae_events$eventID)
# class(algae_events$geoReferenceProtocol) #good
# class(algae_events$coordinateUncertaintyInMeters) #good
# class(algae_events$maximumDepthInMeters) #some missing. ask Melinda to fill this in
# class(algae_events$minimumDepthInMeters) #some missing. ask Melinda to fill this in
# class(algae_events$recordedBy) #some missing. ask Melinda to fill this in
# class(algae_events$samplingProtocol) #some missing. ask Melinda to fill this in
# class(algae_events$`habitatBiotic`)
# class(algae_events$`habitatGeomorphologicalZone`)
# class(algae_events$`habitatSubstrate`)
# Recommended fields:
# class(algae_events$eventRemarks)
# class(algae_events$locality)
# class(algae_events$verbatimCoordinates)
# class(algae_events$eventMedia)

# Parse the spreadsheet date-time and keep only "HH:MM".
algae_events[["eventTime"]] <- as.character(
  parse_date_time(algae_events[["eventTime"]], orders = "ymdHMS", tz = "HST"),
  format = "%H:%M"
)
# --- Invertebrate events ----------------------------------------------------
invert_events <- read_excel(path = "/Users/eric/google_drive/MarineGEO/Inverts/BKANE_063017_FIMS.xlsx", sheet = "Station", skip = 2)
### Format events to schema
# Note: invert event data need a lot of cleaning, including importing stations from the algae and meiofauna team, and making new stations with "A" or "B" appended. Currently 839 samples do not have station information because of this.
# Remove event fields that don't appear in the schema (unmapped spreadsheet
# columns come in named "X..."). Using !grepl() instead of -grep(): when
# nothing matches, -grep() yields -integer(0) and x[, integer(0)] would
# silently drop EVERY column; !grepl() is a safe no-op in that case.
invert_events <- invert_events[, !grepl("^X", names(invert_events), perl = TRUE)]
# Change all empty (all-NA, hence logical) columns to character class.
invert_events[sapply(invert_events, is.logical)] <- lapply(invert_events[sapply(invert_events, is.logical)], as.character)
# Round to 4 decimal digits (~10 m uncertainty) and make sure longitude is negative.
invert_events$decimalLatitude <- round(as.numeric(invert_events$decimalLatitude), digits = 4)
invert_events$decimalLongitude <- abs(round(as.numeric(invert_events$decimalLongitude), digits = 4)) * -1
# Not recorded by the invert team; fill with constant defaults.
invert_events$geoReferenceProtocol <- "GPS"
invert_events$coordinateUncertaintyInMeters <- 100
# class(invert_events$maximumDepthInMeters) #some missing. ask John to fill this in
# class(invert_events$minimumDepthInMeters) #some missing. ask John to fill this in
# Split the raw eventDate into schema year/month/day fields, then drop it.
invert_events$day <- day(invert_events$eventDate)
invert_events$month <- month(invert_events$eventDate)
invert_events$year <- year(invert_events$eventDate)
invert_events$eventDate <- NULL
# class(invert_events$recordedBy) #good
# class(invert_events$samplingProtocol) #some missing. ask John to fill this in
# class(invert_events$habitatGeomorphologicalZone) #some missing. ask John to fill this in. #this needs to be aligned with the schema
# class(invert_events$habitatSubstrate) # some missing. ask John to fill this in. #this needs to be aligned with the schema
# Recommended fields:
# class(invert_events$locality)
# class(invert_events$eventRemarks)
# --- Meiofauna events -------------------------------------------------------
meio_events <- read_excel(path = "/Users/eric/google_drive/MarineGEO/Meiofauna/Hawaii2017_meiofauna_MarineGEO.xlsx", sheet = "station data", skip = 1)
# Remove columns that don't appear in the MarineGEO schema. Using !grepl()
# instead of -grep(): when nothing matches "^X", -grep() yields integer(0)
# and x[, integer(0)] would silently drop EVERY column.
meio_events <- meio_events[, !grepl("^X", names(meio_events), perl = TRUE)]
### Format events to schema
# Missing station KANM087!!!
# Change all empty (all-NA, logical) columns to character class.
meio_events[sapply(meio_events, is.logical)] <- lapply(meio_events[sapply(meio_events, is.logical)], as.character)
# Round to 4 decimal digits (~10 m uncertainty) and make sure longitude is negative.
meio_events$decimalLatitude <- round(as.numeric(meio_events$decimalLatitude), digits = 4)
meio_events$decimalLongitude <- abs(round(as.numeric(meio_events$decimalLongitude), digits = 4)) * -1
# Not recorded by the meio team; constant default.
meio_events$coordinateUncertaintyInMeters <- 100
meio_events$minimumDepthInMeters <- as.numeric(meio_events$minimumDepthInMeters)
meio_events$maximumDepthInMeters <- as.numeric(meio_events$maximumDepthInMeters)
# class(meio_events$maximumDepthInMeters)
# class(meio_events$minimumDepthInMeters)
# class(meio_events$recordedBy) #ask Freya to follow format - add last names, and pipes between names
# class(meio_events$samplingProtocol) #some missing. ask Freya to fill this in
# class(meio_events$habitatGeomorphologicalZone) #some missing. ask Freya to fill this in
# class(meio_events$habitatSubstrate) # some missing. ask Freya to fill this in
# class(meio_events$habitatBiotic)
# Recommended fields:
# class(meio_events$locality)
# class(meio_events$eventRemarks)
# Split the raw eventDate into schema year/month/day fields, then drop it.
meio_events$day <- day(meio_events$eventDate)
meio_events$month <- month(meio_events$eventDate)
meio_events$year <- year(meio_events$eventDate)
meio_events$eventDate <- NULL
# eventTime came in as a fraction of a day (Excel time); scale by 86400 s
# and format as "HH:MM".
meio_events$eventTime <- format(.POSIXct(86400 * as.numeric(meio_events$eventTime), "UTC"), "%H:%M")
# Superseded alternative:
#meio_events$eventTime<-as.character(parse_date_time(meio_events$eventTime, orders="ymdHMS", tz="HST"),format="%H:%M")
meio_events$eventMedia <- "N"
# --- ARMS events ------------------------------------------------------------
arms_events <- read_excel(
  path = "/Users/eric/google_drive/MarineGEO/ARMS/MarineGEOHI_bioassessment_master-ARMS.xlsx",
  sheet = "Station"
)

### Format events to schema
# Empty (all-NA) columns come in as logical; recast to character.
arms_events[vapply(arms_events, is.logical, logical(1))] <-
  lapply(arms_events[vapply(arms_events, is.logical, logical(1))], as.character)

# Round to 4 decimal digits (~10 m uncertainty) and force longitude negative.
arms_events[["decimalLatitude"]] <- round(as.numeric(arms_events[["decimalLatitude"]]), digits = 4)
arms_events[["decimalLongitude"]] <- -abs(round(as.numeric(arms_events[["decimalLongitude"]]), digits = 4))
arms_events[["coordinateUncertaintyInMeters"]] <- 10
arms_events[["minimumDepthInMeters"]] <- as.numeric(arms_events[["minimumDepthInMeters"]])

# Spot checks (left from interactive review):
# class(arms_events$geoReferenceProtocol)
# class(arms_events$eventID)
# class(arms_events$maximumDepthInMeters)
# class(arms_events$recordedBy) #ask Laetitia to follow name format
# class(arms_events$samplingProtocol)
# class(arms_events$habitatGeomorphologicalZone)
# class(arms_events$habitatSubstrate)
# class(arms_events$habitatBiotic)
# Recommended fields:
# class(arms_events$locality)
# class(arms_events$year)
# class(arms_events$month)
# class(arms_events$day)

arms_events[["eventMedia"]] <- "Y" # still need to get these from Laetitia
# --- Transect events --------------------------------------------------------
# Station sheet plus a separate sheet of benthic photo filenames per transect.
trans_events<-read_excel(path="/Users/eric/google_drive/MarineGEO/Transects/bioassessment_KANV.xlsx", sheet = "Station")
trans_event_photos<-read_excel(path="/Users/eric/google_drive/MarineGEO/Transects/KANV_benthic-photos_filenames_20170711.xlsx", sheet = "Sheet1")
# event photos
# Pop off the eventID into its own field: filenames begin "KANV###_...".
trans_event_photos$eventID<-sub(pattern="(KANV\\d\\d\\d)_.+",replacement = "\\1", trans_event_photos$eventMedia, perl=T)
# Use ddply to lump all eventMedia (photo filenames) per event into a single
# pipe-separated string. transform() keeps one row per original photo; every
# row of a group carries the same collapsed string.
trans_eventMedia<-ddply(trans_event_photos, "eventID", transform, eventMedia = paste(eventMedia, collapse = "|"))
# Keep only the first row per eventID (the duplicates are identical).
trans_eventMedia<-trans_eventMedia[!duplicated(trans_eventMedia$eventID),]
# Delete the original eventMedia column and join on the collapsed one.
trans_events$eventMedia<-NULL
trans_events<-left_join(trans_events, trans_eventMedia, by="eventID")
### Format events to schema
# Change all empty (all-NA, logical) columns to character class.
trans_events[sapply(trans_events, is.logical)] <- lapply(trans_events[sapply(trans_events, is.logical)], as.character)
# Round to 4 decimal digits (~10 m uncertainty) and make sure longitude is negative.
trans_events$decimalLatitude<-round(as.numeric(trans_events$decimalLatitude),digits = 4)
trans_events$decimalLongitude<-abs(round(as.numeric(trans_events$decimalLongitude),digits = 4))*-1
trans_events$minimumDepthInMeters<-as.numeric(trans_events$minimumDepthInMeters)
# class(trans_events$eventID)
# class(trans_events$coordinateUncertaintyInMeters)
# class(trans_events$maximumDepthInMeters)
# class(trans_events$recordedBy)
# class(trans_events$samplingProtocol)
# class(trans_events$habitatGeomorphologicalZone)
# class(trans_events$habitatSubstrate)
# class(trans_events$habitatBiotic)
# Recommended fields:
# class(trans_events$locality)
# class(trans_events$year)
# class(trans_events$month)
# class(trans_events$day)
# NOTE(review): this overwrites the pipe-separated photo list joined in above
# with a constant "Y" -- confirm which representation the schema expects.
trans_events$eventMedia<-"Y"
# Parse the spreadsheet date-time and keep only "HH:MM".
trans_events$eventTime<-as.character(parse_date_time(trans_events$eventTime, orders="ymdHMS", tz="HST"),format="%H:%M")
# Stack every team's event table into one master events table. Joining
# pairwise left-to-right, on all shared column names (dplyr's default),
# keeps stations that appear in only one team's records.
events <- Reduce(full_join, list(fish_events, algae_events, invert_events,
                                 meio_events, arms_events, trans_events))
# First go through and make sure all stations have Lat/Longs, or as many as possible. Delete secondary lat/longs
# Read in the sample data, skipping first 3 lines of other headers. Format all times as hh:mm in Excel, paste into textwrangler if they need homogenization (i.e. multiple formats of times)
# --- Fish samples -----------------------------------------------------------
# One row per occurrence, plus a genetic-sample sheet linking occurrences to
# biorepository tissue IDs.
fish_samples<-read_excel(path="/Users/eric/google_drive/MarineGEO/Fish/FISH_DATA_21Jul2017.xlsx",sheet="Samples")
fish_genetic<-read_excel(path="/Users/eric/google_drive/MarineGEO/Fish/FISH_DATA_21Jul2017.xlsx",sheet="Genetic samples")
#these commands apply to the original fish-group template
#remove columns without mapped DwC terms mapped
#fish_samples<-fish_samples[,-grep(pattern = "^X",x = names(fish_samples),perl=T)]
#fish_events<-fish_events[,-grep(pattern = "^X",x = names(fish_events),perl=T)]
#remove records without occurrenceIDs (temp before final dataset)
#fish_samples<-fish_samples[-which(is.na(fish_samples$occurrenceID)),]
#translate fieldIDs to eventIDs
#fish_samples$eventID<-gsub("LRP 17-","KANF0",fish_samples$eventID)
#use ddply to lump all materialSampleIDs into a single field, separated by a |
#fish_samples<-ddply(fish_samples, "occurrenceID", transform, materialSampleID = #paste(materialSampleID, collapse = "|"))
#keep only the first instance of each occurrenceID
#fish_samples<-fish_samples[!duplicated(fish_samples$occurrenceID),]
# Change all empty (all-NA, logical) columns to character class.
fish_samples[sapply(fish_samples, is.logical)] <- lapply(fish_samples[sapply(fish_samples, is.logical)], as.character)
# Reduce eventIDs to the bare "KANF###" station code so they match the events table.
fish_samples$eventID<-str_extract(fish_samples$eventID,"KANF\\d\\d\\d")
fish_samples$otherCatalogNumbers<-NULL # drop this for now - it will be replaced by fish_biorep below
# class(fish_samples[,which(sapply(fish_samples, is.logical))])<-"character"
# class(fish_samples$occurrenceID)
# class(fish_samples$basisofRecord)
# class(fish_samples$catalogNumber)<-"character"
# class(fish_samples$organismScope)
# class(fish_samples$eventID)
# class(fish_samples$scientificName) #eventually parse this into taxon categories?
# class(fish_samples$taxonRank) #using this
# class(fish_samples$individualCount)
# class(fish_samples$institutionID)
# Flip "Last, First" identifier names to "First Last".
fish_samples$identifiedBy<-str_replace(fish_samples$identifiedBy, pattern="(\\w+), (.+)", replacement="\\2 \\1")
fish_samples$catalogNumber<-as.character(fish_samples$catalogNumber)
# Rename genetic-sheet columns by POSITION (col 2 = occurrence ID, col 5 =
# biorepository ID, col 6 = tissue notes).
# NOTE(review): fragile -- breaks silently if the sheet layout changes.
colnames(fish_genetic)[5]<-"BiorepositoryID"
colnames(fish_genetic)[6]<-"tissueNotes"
colnames(fish_genetic)[2]<-"occurrenceID"
# Use ddply to lump all biorepository IDs / tissue notes per occurrence into
# single pipe-separated fields.
fish_biorep<-ddply(fish_genetic, "occurrenceID", transform, otherCatalogNumbers = paste(BiorepositoryID, collapse = "|"), tissueNotes = paste(tissueNotes, collapse="|"))
# Keep only the first instance of each occurrence.
# NOTE(review): this dedupes on otherCatalogNumbers rather than occurrenceID
# (as the original comment claimed) -- confirm which is intended.
fish_biorep<-fish_biorep[!duplicated(fish_biorep$otherCatalogNumbers),]
fish_biorep$otherCatalogNumbers<-as.character(fish_biorep$otherCatalogNumbers)
# First join the biorep numbers onto the samples. Columns chosen by POSITION
# (2, 6, 7) -- presumably occurrenceID, otherCatalogNumbers, tissueNotes in
# the ddply output; verify against the sheet.
fish_samples<-left_join(fish_samples,fish_biorep[,c(2,6,7)],by="occurrenceID")
# Now join samples to the master events table on eventID.
fish<-left_join(fish_samples,events,by="eventID")
# Optionally make the bounding box from the points provided:
#bbox<-make_bbox(lon=fish2$decimalLongitude,lat=fish2$decimalLatitude)
# Per-station individual counts (one row per unique lat/long pair).
fish2 <- fish %>% group_by(decimalLatitude, decimalLongitude) %>% summarize(count = n())
# Per-species counts at each station.
fish3 <- fish %>% group_by(scientificName, decimalLatitude, decimalLongitude) %>% summarize(count = n())
# Species richness per station = number of distinct species rows.
fish4 <- fish3 %>% group_by(decimalLatitude, decimalLongitude) %>% summarize(richness = n())
fish6 <- left_join(fish2, fish4)
# Map: point size = individual count, colour = species richness.
# (Fixed a stray trailing comma inside guide_colorbar().)
fish_map <- ggmap(dmap) +
  geom_point(data = fish6,
             mapping = aes(x = decimalLongitude, y = decimalLatitude, size = count, color = richness)) +
  scale_color_gradient(low = "green", high = "red") +
  guides(color = guide_colorbar(title = "Species Richness"),
         size = guide_legend(title = "Individual Count")) +
  theme(axis.title = element_blank())
fish_map
ggsave(fish_map, filename = "./output/fish_map.pdf")
# --- Algae samples ----------------------------------------------------------
#edit eventIDs for capitalization
algae_samples <- read_excel("/Users/eric/google_drive/MarineGEO/algae/MarineGEOHI_bioassessment_master_KANA.xlsx", sheet = "Sample")
# Change all empty (all-NA, logical) columns to character class.
algae_samples[sapply(algae_samples, is.logical)] <- lapply(algae_samples[sapply(algae_samples, is.logical)], as.character)
# class(algae_samples$occurrenceID)
# class(algae_samples$catalogNumber)
# class(algae_samples$otherCatalogNumbers)
# class(algae_samples$organismScope)
# class(algae_samples$eventID)
# class(algae_samples$scientificName) #eventually parse this into taxon categories?
# class(algae_samples$taxonRank) # Melinda needs to populate this...
# class(algae_samples$identifiedBy)
# class(algae_samples$individualCount)
algae_samples$basisofRecord <- "specimen"
algae_samples$institutionID <- "USNM"
algae <- left_join(algae_samples, events, by = "eventID")
# Per-station individual counts.
algae2 <- algae %>% group_by(decimalLatitude, decimalLongitude) %>% summarize(count = n())
# Per-species counts at each station, excluding unidentified ("?") records.
# filter() replaces algae[-which(scientificName=="?"),]: -which() drops
# EVERY row when no "?" exists; filter() keeps NA names, as -which() did.
algae3 <- algae %>%
  filter(is.na(scientificName) | scientificName != "?") %>%
  group_by(scientificName, decimalLatitude, decimalLongitude) %>%
  summarize(count = n())
# Richness per station.
algae4 <- algae3 %>% group_by(decimalLatitude, decimalLongitude) %>% summarize(richness = n())
algae5 <- left_join(algae2, algae4)
#algae5<-algae5[which(is.na(algae5$decimalLatitude)),]
#algae5<-algae5[-which(algae5$scientificName=="?"),]
# Manual patches keyed to row/column POSITION in algae5.
# NOTE(review): fragile -- these assume a fixed row order; re-check whenever
# the input data change.
algae5[4, 4] <- 1
algae5[11, 4] <- 2
extra <- algae5[33, ]   # the outlier station, re-plotted as its own layer below
algae5 <- algae5[-33, ] # remove outlier
# Map; the outlier is added back as a separate layer with size/colour mapped
# to the constant 30, pushing it to the top of both scales (big red dot).
# (Fixed a stray trailing comma inside guide_colorbar().)
algae_map <- ggmap(dmap) +
  geom_point(data = algae5,
             mapping = aes(x = decimalLongitude, y = decimalLatitude, size = count, color = richness)) +
  scale_color_gradient(low = "green", high = "red") +
  guides(color = guide_colorbar(title = "Type Richness"),
         size = guide_legend(title = "Individual Count")) +
  theme(axis.title = element_blank()) +
  geom_point(data = extra, mapping = aes(x = decimalLongitude, y = decimalLatitude, size = 30, color = 30))
algae_map
ggsave(algae_map, filename = "./output/algae_map.pdf")
# --- Invertebrate + sponge samples ------------------------------------------
invert_samples <- read_excel(path = "/Users/eric/google_drive/MarineGEO/Inverts/BKANE_063017_FIMS.xlsx", sheet = "Specimen", skip = 3)
# Sponges are loaded here, cleaned up, and then joined with the rest of the inverts.
sponge_samples <- read_excel(path = "/Users/eric/google_drive/MarineGEO/Sponges/VICENTE_BIOBLITZ2017_SPONGEMETADATA_FIMS.xlsx", sheet = "Sheet1", skip = 1)
# Remove columns that don't appear in the MarineGEO schema. !grepl() instead
# of -grep(): -grep() silently drops EVERY column when nothing matches "^X".
invert_samples <- invert_samples[, !grepl("^X", names(invert_samples), perl = TRUE)]
sponge_samples <- sponge_samples[, !grepl("^X", names(sponge_samples), perl = TRUE)]
# Remove records without a scientificName (temp before final dataset).
# !is.na() instead of -which(is.na()): -which() drops EVERY row when there
# are no NAs.
invert_samples <- invert_samples[!is.na(invert_samples$scientificName), ]
sponge_samples <- sponge_samples[!is.na(sponge_samples$scientificName), ]
# Change all empty (all-NA, logical) columns to character class.
invert_samples[sapply(invert_samples, is.logical)] <- lapply(invert_samples[sapply(invert_samples, is.logical)], as.character)
sponge_samples[sapply(sponge_samples, is.logical)] <- lapply(sponge_samples[sapply(sponge_samples, is.logical)], as.character)
# All sponge records are Porifera; fill in where missing.
sponge_samples$phylum[which(is.na(sponge_samples$phylum))] <- "Porifera"
# Strip hyphens from sponge eventIDs so they match the station ID format.
sponge_samples$eventID <- str_replace(sponge_samples$eventID, pattern = "-", replacement = "")
invert_samples <- full_join(invert_samples, sponge_samples)
# Populate taxonRank with the lowest known taxonomic category: each later
# assignment overwrites the earlier, so the most specific non-NA rank wins.
# (This line was bare prose in the original script -- a syntax error -- and
# is now a comment.)
invert_samples$taxonRank[which(!is.na(invert_samples$phylum))] <- "phylum"
invert_samples$taxonRank[which(!is.na(invert_samples$class))] <- "class"
invert_samples$taxonRank[which(!is.na(invert_samples$subclass))] <- "subclass"
invert_samples$taxonRank[which(!is.na(invert_samples$order))] <- "order"
invert_samples$taxonRank[which(!is.na(invert_samples$suborder))] <- "suborder"
invert_samples$taxonRank[which(!is.na(invert_samples$superfamily))] <- "superfamily"
invert_samples$taxonRank[which(!is.na(invert_samples$family))] <- "family"
invert_samples$taxonRank[which(!is.na(invert_samples$subfamily))] <- "subfamily"
invert_samples$taxonRank[which(!is.na(invert_samples$genus))] <- "genus"
invert_samples$taxonRank[which(!is.na(invert_samples$species))] <- "species"
# "Genus sp." entries are only identified to genus, despite filling species.
invert_samples$taxonRank[grep("sp\\.", invert_samples$species)] <- "genus"
# Strip the trailing replicate letter (e.g. "KANI001A" -> "KANI001") so sample
# eventIDs can match the station table.
# NOTE(review): the original script ran this expression WITHOUT assigning the
# result and left ~200 lines of pasted console output here, which did not
# parse as R. The output has been removed and the result is now assigned.
# Some IDs still match no station (e.g. "T19", "MXW001 or 002", "5/24 am ?");
# those are collected into bad_eventIDs after the join below.
invert_samples$eventID <- str_replace(string = invert_samples$eventID, pattern = "(KAN\\w\\d\\d\\d)[A-Z]", replacement = "\\1")
# Join invert samples to the master events table.
invert <- left_join(invert_samples, events, by = "eventID")
# Per-station totals of individuals.
invert2 <- invert %>% group_by(decimalLatitude, decimalLongitude) %>% summarize(individualCount = sum(individualCount, na.rm = TRUE))
# Per-species counts at each station.
invert3 <- invert %>% group_by(scientificName, decimalLatitude, decimalLongitude) %>% summarize(count = n())
# Richness per station.
invert4 <- invert3 %>% group_by(decimalLatitude, decimalLongitude) %>% summarize(richness = n())
invert5 <- left_join(invert2, invert4)
# Map, dropping the final two rows of the summary (presumably the
# NA-coordinate stations, which sort last -- verify). nrow() replaces
# length(invert5$richness); stray trailing comma removed from guide_colorbar().
invert_map <- ggmap(dmap) +
  geom_point(data = invert5[-c(nrow(invert5) - 1, nrow(invert5)), ],
             mapping = aes(x = decimalLongitude, y = decimalLatitude, size = individualCount, color = richness)) +
  scale_color_gradient(low = "green", high = "red") +
  guides(color = guide_colorbar(title = "Type Richness"),
         size = guide_legend(title = "Individual Count")) +
  theme(axis.title = element_blank())
invert_map
ggsave(invert_map, filename = "./output/invert_map.pdf")
# Vector of sample eventIDs that found no match in the events table.
bad_eventIDs <- invert$eventID[which(is.na(invert$decimalLatitude))]
# Sponge subset (phylum Porifera) gets its own map.
sponge <- invert[which(invert$phylum == "Porifera"), ]
# Per-station totals of individuals.
sponge2 <- sponge %>% group_by(decimalLatitude, decimalLongitude) %>% summarize(individualCount = sum(individualCount, na.rm = TRUE))
# Per-species counts at each station.
sponge3 <- sponge %>% group_by(scientificName, decimalLatitude, decimalLongitude) %>% summarize(count = n())
# Richness per station.
sponge4 <- sponge3 %>% group_by(decimalLatitude, decimalLongitude) %>% summarize(richness = n())
sponge5 <- left_join(sponge2, sponge4)
# NOTE(review): rows 61-62 dropped by hard-coded position (presumably the
# NA-coordinate rows); this breaks silently if the data change -- consider
# filtering on is.na(decimalLatitude) instead.
sponge_map <- ggmap(dmap) +
  geom_point(data = sponge5[-c(61, 62), ],
             mapping = aes(x = decimalLongitude, y = decimalLatitude, size = individualCount, color = richness)) +
  scale_color_gradient(low = "green", high = "red") +
  guides(color = guide_colorbar(title = "Type Richness"),
         size = guide_legend(title = "Individual Count")) +
  theme(axis.title = element_blank())
sponge_map
ggsave(sponge_map, filename = "./output/sponge_map.pdf")
#note meio data still need some cleaning, including importing some stations from inverts and fish, and fixing up eventIDs to have 3 digits instead of 2. There are 27 specimens that do not have station information because of this.
meio_samples <- read_excel(path = "/Users/eric/google_drive/MarineGEO/Meiofauna/Hawaii2017_meiofauna_MarineGEO.xlsx", sheet = "specimen data", skip = 1)
# Remove columns that don't appear in the MarineGEO schema.
# grepl() is used instead of -grep(): when NO name matches "^X",
# grep() returns integer(0) and df[, -integer(0)] would drop EVERY column.
meio_samples <- meio_samples[, !grepl("^X", names(meio_samples), perl = TRUE)]
#add occurrence IDs to one investigators samples - this has been taken care of now
#meio_samples$occurrenceID[which(is.na(meio_samples$occurrenceID))]<-"UJ"
#make occurrence IDs unique
#meio_samples$occurrenceID<-make.unique(meio_samples$occurrenceID, sep="_")
# Change all empty (all-NA logical) columns to character class so they match
# the schema's text fields.
meio_samples[sapply(meio_samples, is.logical)] <- lapply(meio_samples[sapply(meio_samples, is.logical)], as.character)
meio_samples$basisofRecord <- "specimen" #check with Frey that this is correct
meio_samples$institutionID <- "USNM"
# Field-by-field schema checks (run interactively as needed):
# class(meio_samples$scientificName) #eventually parse this into taxon categories?
# class(meio_samples$taxonRank) #using this
# class(meio_samples$occurrenceID)
# class(meio_samples$catalogNumber)
# class(meio_samples$otherCatalogNumbers)
# class(meio_samples$organismScope)
# class(meio_samples$eventID)
# class(meio_samples$identifiedBy)
# class(meio_samples$individualCount)
## Meiofauna: join occurrences to events, summarise, and map ----
meio <- left_join(meio_samples, meio_events, by = "eventID")
# Total individuals per station (lat/long pair)
meio2 <- meio %>%
  group_by(decimalLatitude, decimalLongitude) %>%
  summarize(individualCount = sum(individualCount, na.rm = TRUE))
# Occurrence rows per taxon per station
meio3 <- meio %>%
  group_by(scientificName, decimalLatitude, decimalLongitude) %>%
  summarize(count = n())
# Taxon richness per station
meio4 <- meio3 %>%
  group_by(decimalLatitude, decimalLongitude) %>%
  summarize(richness = n())
meio5 <- left_join(meio2, meio4, by = c("decimalLatitude", "decimalLongitude"))
meio_map <- ggmap(dmap) +
  geom_point(data = meio5,
             mapping = aes(x = decimalLongitude, y = decimalLatitude,
                           size = individualCount, color = richness)) +
  scale_color_gradient(low = "green", high = "red") +
  guides(color = guide_colorbar(title = "Type Richness"),
         size = guide_legend(title = "Individual Count")) +
  theme(axis.title = element_blank())
meio_map
ggsave(meio_map, filename = "./output/meio_map.pdf")
## ARMS: load and tidy the sample sheet ----
arms_samples <- read_excel(path = "/Users/eric/google_drive/MarineGEO/ARMS/MarineGEOHI_bioassessment_master-ARMS.xlsx", sheet = "Sample")
# Columns that read in entirely empty come back as logical; convert them to
# character so they match the schema's text fields.
empty_cols <- sapply(arms_samples, is.logical)
arms_samples[empty_cols] <- lapply(arms_samples[empty_cols], as.character)
# Field-by-field schema checks (run interactively as needed):
# class(arms_samples$occurrenceID)
# class(arms_samples$basisofRecord)
# class(arms_samples$catalogNumber)
# class(arms_samples$otherCatalogNumbers)
# class(arms_samples$organismScope) #were there some slurries too?
# class(arms_samples$eventID)
# class(arms_samples$scientificName) #eventually parse this into taxon categories?
# class(arms_samples$taxonRank) #using this
# class(arms_samples$identifiedBy)
# class(arms_samples$individualCount)
# Treat missing counts as a single individual for now
arms_samples$individualCount[is.na(arms_samples$individualCount)] <- 1
arms_samples$institutionID <- "USNM"
## ARMS: join occurrences to events, summarise, and map ----
arms <- left_join(arms_samples, arms_events, by = "eventID")
# Total individuals per station (lat/long pair)
arms2 <- arms %>%
  group_by(decimalLatitude, decimalLongitude) %>%
  summarize(individualCount = sum(individualCount, na.rm = TRUE))
# Occurrence rows per taxon per station
arms3 <- arms %>%
  group_by(scientificName, decimalLatitude, decimalLongitude) %>%
  summarize(count = n())
# Taxon richness per station
arms4 <- arms3 %>%
  group_by(decimalLatitude, decimalLongitude) %>%
  summarize(richness = n())
arms5 <- left_join(arms2, arms4, by = c("decimalLatitude", "decimalLongitude"))
arms_map <- ggmap(dmap) +
  geom_point(data = arms5,
             mapping = aes(x = decimalLongitude, y = decimalLatitude,
                           size = individualCount, color = richness)) +
  scale_color_gradient(low = "green", high = "red") +
  guides(color = guide_colorbar(title = "Type Richness"),
         size = guide_legend(title = "Individual Count")) +
  theme(axis.title = element_blank())
arms_map
ggsave(arms_map, filename = "./output/arms_map.pdf")
## Transects: load samples and the organism photo-voucher log ----
trans_samples<-read_excel(path="/Users/eric/google_drive/MarineGEO/Transects/bioassessment_KANV.xlsx", sheet = "Sample")
trans_organism_photos<-read_excel(path="/Users/eric/google_drive/MarineGEO/Transects/KANV_photo-vouchers_20170712.xlsx", sheet = "Sheet1")
# organism photos
## delete the field to be added later (rebuilt below from the photo log)
trans_samples$associatedMedia<-NULL
#pop off the eventID again into its own field
# Photo filenames embed the event ID as "KANV###_..."; capture group 1 keeps it
trans_organism_photos$eventID<-sub(pattern="(KANV\\d\\d\\d)_.+",replacement = "\\1", trans_organism_photos$associatedMedia, perl=T)
#pop off the species name into its own field
# Filenames look like ..._YYYYMMDD_Genus-species_...; capture group 2 is the name
trans_organism_photos$scientificName<-sub(pattern=".+_(\\d{8})_(\\w+-[a-z]+)_.+", replacement = "\\2", trans_organism_photos$associatedMedia,perl=T)
# Replace the (first) hyphen with a space to get "Genus species"
trans_organism_photos$scientificName<-sub(pattern="-", replacement=" ",x = trans_organism_photos$scientificName)
#pop off the initials of the collector, and replace with full name
# NOTE(review): photos by anyone other than ZF/RW would get identifiedBy = NA — confirm
trans_organism_photos$identifiedBy<-str_extract(pattern="ZF|RW", string=trans_organism_photos$associatedMedia)
trans_organism_photos$identifiedBy[which(trans_organism_photos$identifiedBy=="ZF")]<-"Zach Foltz"
trans_organism_photos$identifiedBy[which(trans_organism_photos$identifiedBy=="RW")]<-"Ross Whippo"
#use ddply to lump all associatedMedia into a single field, separated by a |
# transform keeps one row per photo, so every row of a group carries the same
# collapsed "a|b|c" string
trans_associatedMedia<-ddply(trans_organism_photos, c("eventID","scientificName","identifiedBy"), transform, associatedMedia = paste(associatedMedia, collapse = "|"))
#remove all but the first instance
# Dedupe on the collapsed media string so each event/species/identifier
# combination keeps a single row
trans_associatedMedia<-trans_associatedMedia[!duplicated(trans_associatedMedia$associatedMedia),]
trans_samples<-left_join(trans_samples, trans_associatedMedia, by=c("eventID","scientificName","identifiedBy"))
# Columns that read in entirely empty come back as logical; convert them to
# character so they match the schema's text fields.
empty_cols <- sapply(trans_samples, is.logical)
trans_samples[empty_cols] <- lapply(trans_samples[empty_cols], as.character)
# Field-by-field schema checks (run interactively as needed):
# class(trans_samples$occurrenceID)
# class(trans_samples$basisofRecord)
# class(trans_samples$catalogNumber)
# class(trans_samples$otherCatalogNumbers)
# class(trans_samples$organismScope)
# class(trans_samples$eventID)
# class(trans_samples$scientificName) #eventually parse this into taxon categories?
# class(trans_samples$taxonRank) #using this
# class(trans_samples$identifiedBy)
# class(trans_samples$individualCount)
trans_samples$institutionID <- "USNM"
## Transects: join occurrences to events, summarise, and map ----
trans <- left_join(trans_samples, trans_events, by = "eventID")
# Total individuals per station (lat/long pair)
trans2 <- trans %>%
  group_by(decimalLatitude, decimalLongitude) %>%
  summarize(individualCount = sum(individualCount, na.rm = TRUE))
# Occurrence rows AND total individuals per taxon per station
trans3 <- trans %>%
  group_by(scientificName, decimalLatitude, decimalLongitude) %>%
  summarize(count = n(), individualCount = sum(individualCount, na.rm = TRUE))
# Species richness per station
trans4 <- trans3 %>%
  group_by(decimalLatitude, decimalLongitude) %>%
  summarize(richness = n())
trans5 <- left_join(trans2, trans4, by = c("decimalLatitude", "decimalLongitude"))
trans_map <- ggmap(dmap) +
  geom_point(data = trans5,
             mapping = aes(x = decimalLongitude, y = decimalLatitude,
                           size = individualCount, color = richness)) +
  scale_color_gradient(low = "green", high = "red") +
  guides(color = guide_colorbar(title = "Species Richness"),
         size = guide_legend(title = "Individual Count")) +
  theme(axis.title = element_blank())
trans_map
ggsave(trans_map, filename = "./output/trans_map.pdf")
# Now to join everything into one monster database
WriteXLS(c(MarineGEOHI2,as.data.frame(events)),ExcelFileName = "./output/MarineGEOHI_data_1.1.xlsx",SheetNames=c("Events","Samples"))
# (console output from the failed WriteXLS call, kept for reference)
# Error in WriteXLS(c(MarineGEOHI2, as.data.frame(events)), ExcelFileName = "./output/MarineGEOHI_data_1.1.xlsx", :
#   One or more of the objects named in 'x' is not a data frame or does not exist